Atherogenic transition of vascular smooth muscle cells (SMCs) has been associated with reduced ATF3 motif activity and gene expression. To investigate whether ATF3 plays a direct regulatory role in SMC function, loss-of-function experiments were conducted by knocking down (KD) ATF3 in freshly isolated and cultured vascular SMCs from lineage-traced atherosclerotic mice (Pan et al., 2021, CVR). By examining transcriptomic and secretomic changes following ATF3 knockdown in a controlled in vitro setting, the study aimed to delineate the functional consequences of reduced ATF3 expression on SMC physiology.
Raw RNA-Seq data were retrieved from the NCBI Sequence Read Archive (SRA) under accession number PRJNA716327. This report details the bioinformatic analysis of RNA-Seq data from four mouse SMC samples comparing ATF3 knockdown (Treated) to control conditions. The workflow encompasses transcript quantification using Salmon, data aggregation and quality control with tximport and DESeq2, differential gene expression analysis using DESeq2, and functional interpretation through Gene Ontology (GO) enrichment and Gene Set Enrichment Analysis (GSEA) against KEGG and Hallmark pathway databases.
The following bash commands outline the process used to generate
transcript abundance estimates from the raw RNA-Seq reads. This involves
setting up a Salmon index using the Gencode mouse transcriptome (vM36)
and genome (GRCm39), downloading the raw FASTQ files from the SRA, and
running Salmon’s quant command for each sample.
mkdir -p index/mm_gencode_36
wget -c -O index/mm_gencode_36/genome.fa.gz ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M36/GRCm39.genome.fa.gz
wget -c -O index/mm_gencode_36/transcripts.fa.gz ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M36/gencode.vM36.transcripts.fa.gz
zcat index/mm_gencode_36/transcripts.fa.gz | grep "^>" | sed 's|[<>,]||g' >index/mm_gencode_36/transcripts.txt
gunzip -c index/mm_gencode_36/genome.fa.gz | grep "^>" | cut -d " " -f1 | sed 's/>//g' > "index/mm_gencode_36/decoys.txt"
cat index/mm_gencode_36/transcripts.fa.gz index/mm_gencode_36/genome.fa.gz >index/mm_gencode_36/gentrome.fa.gz
salmon index -t index/mm_gencode_36/gentrome.fa.gz -d index/mm_gencode_36/decoys.txt -p 8 -i index/mm_gencode_36 --gencode
wget -c -O SRR14055936_1.fastq.gz https://ftp.sra.ebi.ac.uk/vol1/fastq/SRR140/036/SRR14055936/SRR14055936_1.fastq.gz
wget -c -O SRR14055936_2.fastq.gz https://ftp.sra.ebi.ac.uk/vol1/fastq/SRR140/036/SRR14055936/SRR14055936_2.fastq.gz
wget -c -O SRR14055935_1.fastq.gz https://ftp.sra.ebi.ac.uk/vol1/fastq/SRR140/035/SRR14055935/SRR14055935_1.fastq.gz
wget -c -O SRR14055935_2.fastq.gz https://ftp.sra.ebi.ac.uk/vol1/fastq/SRR140/035/SRR14055935/SRR14055935_2.fastq.gz
wget -c -O SRR14055934_1.fastq.gz https://ftp.sra.ebi.ac.uk/vol1/fastq/SRR140/034/SRR14055934/SRR14055934_1.fastq.gz
wget -c -O SRR14055934_2.fastq.gz https://ftp.sra.ebi.ac.uk/vol1/fastq/SRR140/034/SRR14055934/SRR14055934_2.fastq.gz
wget -c -O SRR14055933_1.fastq.gz https://ftp.sra.ebi.ac.uk/vol1/fastq/SRR140/033/SRR14055933/SRR14055933_1.fastq.gz
wget -c -O SRR14055933_2.fastq.gz https://ftp.sra.ebi.ac.uk/vol1/fastq/SRR140/033/SRR14055933/SRR14055933_2.fastq.gz
salmon quant -i index/mm_gencode_36 -p 12 -l A --validateMappings --gcBias -o SAMN18442664 -1 SRR14055936_1.fastq.gz -2 SRR14055936_2.fastq.gz
salmon quant -i index/mm_gencode_36 -p 12 -l A --validateMappings --gcBias -o SAMN18442665 -1 SRR14055935_1.fastq.gz -2 SRR14055935_2.fastq.gz
salmon quant -i index/mm_gencode_36 -p 12 -l A --validateMappings --gcBias -o SAMN18442666 -1 SRR14055934_1.fastq.gz -2 SRR14055934_2.fastq.gz
salmon quant -i index/mm_gencode_36 -p 12 -l A --validateMappings --gcBias -o SAMN18442667 -1 SRR14055933_1.fastq.gz -2 SRR14055933_2.fastq.gz
First, we load all necessary R packages for the downstream analysis, including tools for data import, differential expression, annotation, visualization, and functional enrichment.
library(tximport)
library(DESeq2)
library(EnsDb.Mmusculus.v79)
library(org.Mm.eg.db)
library(msigdbr)
library(ggplot2)
library(ggrepel)
library(pheatmap)
library(RColorBrewer)
library(clusterProfiler)
library(fgsea)
library(patchwork)
library(knitr)
Next, we specify the directory containing the Salmon output files,
define the sample identifiers, and assign experimental conditions
(Control vs. Treated). A color scheme for plotting is also defined. This
information is compiled into a colData dataframe, which
serves as the metadata input for DESeq2.
salmon_dir <- "../01_Salmon/mm_gencode_36/"
sample_names <- c("SAMN18442664", "SAMN18442665", "SAMN18442666", "SAMN18442667")
conditions <- factor(rep(c("Control", "Treated"), each = 2), levels = c("Control", "Treated"))
my_manual_colors <- c("Control" = "#377EB8", "Treated" = "#FF7F00")
quant_files <- file.path(salmon_dir, sample_names, "quant.sf.gz")
if (!all(file.exists(quant_files))) stop("Error: Not all Salmon quant.sf files found.")
names(quant_files) <- sample_names
colData <- data.frame(sample = sample_names, condition = conditions, row.names = sample_names)
kable(colData, caption = "Sample Metadata (colData)")
| sample | condition | |
|---|---|---|
| SAMN18442664 | SAMN18442664 | Control |
| SAMN18442665 | SAMN18442665 | Control |
| SAMN18442666 | SAMN18442666 | Treated |
| SAMN18442667 | SAMN18442667 | Treated |
To summarize transcript-level abundances to the gene level, a mapping
between transcript IDs and gene IDs is required. We create this map
using the EnsDb.Mmusculus.v79 annotation database, which
corresponds to the GRCm38/mm10 genome build used for quantification.
tryCatch({
txdb <- EnsDb.Mmusculus.v79
k <- keys(txdb, keytype = "TXNAME")
tx2gene <- AnnotationDbi::select(txdb, keys = k, keytype = "TXNAME", columns = "GENEID")
tx2gene <- tx2gene[, c("TXNAME", "GENEID")]
colnames(tx2gene) <- c("TXID", "GENEID")
tx2gene <- tx2gene[!duplicated(tx2gene$TXID),]
kable(head(tx2gene), caption = "Head of Transcript-to-Gene Map (tx2gene)")
}, error = function(e) { stop("Error creating tx2gene map: ", e$message) })
| TXID | GENEID |
|---|---|
| ENSMUST00000000001 | ENSMUSG00000000001 |
| ENSMUST00000000003 | ENSMUSG00000000003 |
| ENSMUST00000000010 | ENSMUSG00000020875 |
| ENSMUST00000000028 | ENSMUSG00000000028 |
| ENSMUST00000000033 | ENSMUSG00000048583 |
| ENSMUST00000000049 | ENSMUSG00000000049 |
Using the tximport package, we import the Salmon
quantification files (quant.sf.gz) and summarize the
transcript-level counts, abundance estimates (TPM), and effective
lengths to the gene level using the previously created
tx2gene map.
txi <- tximport(quant_files, type = "salmon", tx2gene = tx2gene, ignoreTxVersion = TRUE)
The imported gene-level count data and the sample metadata
(colData) are used to construct a DESeqDataSet
object. This object is the central data structure for DESeq2 analyses,
containing counts, metadata, and the experimental design
(~ condition).
dds <- DESeqDataSetFromTximport(txi = txi, colData = colData, design = ~ condition)
Before differential expression analysis, quality control steps are performed to assess sample relationships and identify potential outliers.
PCA is applied to variance-stabilized transformed (VST) count data to visualize the similarity between samples in a reduced dimensional space. Samples belonging to the same experimental group are expected to cluster together.
vsd <- vst(dds, blind = TRUE)
pca_plot <- plotPCA(vsd, intgroup = "condition") +
ggtitle("PCA Plot of Samples (VST Counts)") +
geom_point(size=3) + scale_color_manual(values = my_manual_colors) + theme_bw()
print(pca_plot)
PCA plot of samples based on VST-transformed counts.
A heatmap visualizing the Euclidean distances between samples based on VST counts provides another perspective on sample similarity. Samples with similar expression profiles will cluster together.
sampleDists <- dist(t(assay(vsd)))
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- paste(vsd$condition, vsd$sample, sep="-")
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix, clustering_distance_rows=sampleDists,
clustering_distance_cols=sampleDists, col=colors,
main="Sample Distance Matrix (VST Counts)")
Heatmap of Euclidean distances between samples based on VST counts.
Genes with very low counts across all samples provide little information for differential expression testing and can interfere with some statistical approximations. We remove genes where the count is less than 10 in fewer than 2 samples (the size of the smallest group). The number of genes before and after filtering is tracked.
keep <- rowSums(counts(dds) >= 10) >= 2
num_before_filter <- nrow(dds)
dds <- dds[keep, ]
num_after_filter <- nrow(dds)
# Filtering summary: Kept num_after_filter out of num_before_filter genes.
The core differential expression analysis is performed using the
DESeq() function. This function estimates size factors for
normalization, estimates gene-wise dispersions, and fits a negative
binomial generalized linear model (GLM) to test for differences between
the “Treated” and “Control” conditions.
dds <- DESeq(dds)
The results of the differential expression test are extracted using
the results() function, specifying the comparison contrast
(“Treated” vs “Control”). An adjusted p-value threshold
(alpha) of 0.05 is used for significance counting in the
summary provided by the summary() function. Gene symbols
are added to the results table by mapping Ensembl Gene IDs using the
annotation database.
res <- results(dds, contrast = c("condition", "Treated", "Control"), alpha = 0.05)
summary(res)
##
## out of 13248 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 2626, 20%
## LFC < 0 (down) : 2096, 16%
## outliers [1] : 0, 0%
## low counts [2] : 0, 0%
## (mean count < 5)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
res$SYMBOL <- mapIds(EnsDb.Mmusculus.v79, keys = rownames(res),
column = "SYMBOL", keytype = "GENEID", multiVals = "first")
res_df <- as.data.frame(res) %>%
rownames_to_column("ENSEMBL") %>%
as_tibble() %>%
arrange(padj)
# Check for missing symbols: sum(is.na(res_df$SYMBOL)) genes had missing symbols after mapping.
A volcano plot provides a useful visualization of the differential expression results, plotting the log2 fold change against the negative log10 adjusted p-value. Significantly up- and down-regulated genes are highlighted, and the top 20 most significant genes are labeled with their symbols (or Ensembl IDs if symbols are missing).
padj_threshold <- 0.05; log2fc_threshold <- 1.0
res_df_plot <- res_df %>%
mutate(significance_level = case_when(
padj < padj_threshold & log2FoldChange > log2fc_threshold ~ "Upregulated",
padj < padj_threshold & log2FoldChange < -log2fc_threshold ~ "Downregulated",
TRUE ~ "Not Significant" )) %>%
mutate(rank = row_number()) %>%
mutate(label_for_plot = case_when(
rank <= 20 & !is.na(SYMBOL) ~ SYMBOL,
rank <= 20 & is.na(SYMBOL) ~ ENSEMBL,
TRUE ~ NA_character_ ))
volcano_colors <- c("Upregulated" = "red", "Downregulated" = "blue", "Not Significant" = "grey60")
volcano_plot_top20 <- ggplot(res_df_plot, aes(x = log2FoldChange, y = -log10(padj))) +
geom_point(aes(color = significance_level), alpha = 0.7, size = 1.8) +
scale_color_manual(name = "Significance", values = volcano_colors) +
geom_text_repel(aes(label = label_for_plot), na.rm = TRUE, box.padding = 0.4, point.padding = 0.6,
segment.color = 'grey50', segment.size = 0.3, max.overlaps = 25, size = 3.0) +
geom_hline(yintercept = -log10(padj_threshold), linetype = "dashed", color = "grey40") +
geom_vline(xintercept = c(-log2fc_threshold, log2fc_threshold), linetype = "dashed", color = "grey40") +
labs(title = "Volcano Plot: Treated vs Control",
subtitle = "Top 20 significant genes labeled (Symbol or ENSEMBL ID)",
x = "Log2 Fold Change (Treated / Control)", y = "-Log10 Adjusted P-value") +
theme_bw(base_size = 12) +
theme(legend.position = "bottom", plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5))
print(volcano_plot_top20)
Volcano plot showing log2 fold change vs. -log10 adjusted p-value. Top 20 significant genes are labeled.
The complete, unfiltered table of differential expression results for all genes that passed the initial low-count filter is presented below. This interactive table allows sorting, searching, and downloading of the full results.
final_results_table <- res_df %>%
select(ENSEMBL, SYMBOL, baseMean, log2FoldChange, lfcSE, stat, pvalue, padj) %>%
arrange(padj)
datatable(final_results_table,
caption = "Full Unfiltered Differential Expression Results (Treated vs Control)",
rownames = FALSE,
filter = 'top',
extensions = 'Buttons',
options = list(pageLength = 10,
scrollX = TRUE,
dom = 'Bfrtip',
buttons = c('copy', 'csv', 'excel'))) %>%
formatRound(columns = c('baseMean'), digits = 1) %>%
formatRound(columns = c('log2FoldChange', 'lfcSE', 'stat'), digits = 3) %>%
formatSignif(columns = c('pvalue', 'padj'), digits = 3)
output_file <- "DE_Results_Treated_vs_Control_Unfiltered.csv"
write.csv(final_results_table, file = output_file, row.names = FALSE)
To understand the biological implications of the observed gene expression changes, we perform functional enrichment analyses.
GO analysis identifies Gene Ontology terms (Biological Process - BP, Molecular Function - MF, Cellular Component - CC) that are statistically over-represented within the lists of significantly up- and down-regulated genes.
First, we define lists of significant up-regulated and down-regulated genes based on adjusted p-value (< 0.05) and log2 fold change (> 1.0 or < -1.0) thresholds. A list of all tested genes (the “universe”) is also prepared. The number of genes in each list is determined.
padj_threshold <- 0.05; log2fc_threshold <- 1.0
up_genes <- final_results_table %>% filter(padj < padj_threshold, log2FoldChange > log2fc_threshold) %>% pull(ENSEMBL) %>% na.omit() %>% unique()
down_genes <- final_results_table %>% filter(padj < padj_threshold, log2FoldChange < -log2fc_threshold) %>% pull(ENSEMBL) %>% na.omit() %>% unique()
universe_genes <- final_results_table %>% pull(ENSEMBL) %>% na.omit() %>% unique()
# Summary: length(up_genes) Up, length(down_genes) Down, length(universe_genes) Universe genes for GO.
The enrichGO function from the
clusterProfiler package is used to perform the enrichment
analysis separately for up- and down-regulated genes against the
background universe. We use the org.Mm.eg.db database for
mouse annotations and Ensembl IDs as the key type. Enrichment results
are stored, checking if significant terms were found.
run_enrichment <- function(gene_list, universe_list, ontology_type = "ALL", description) {
# Running GO enrichment for description (length(gene_list) genes)
if (length(gene_list) < 5) { # Skipping enrichment, too few genes.
return(NULL)
}
tryCatch({
enrich_result <- enrichGO(gene=gene_list, universe=universe_list, OrgDb=org.Mm.eg.db, keyType="ENSEMBL", ont=ontology_type, pAdjustMethod="BH", pvalueCutoff=0.05, qvalueCutoff=0.10, readable=FALSE)
if (is.null(enrich_result) || nrow(enrich_result@result) == 0) { # No significant enrichment found.
return(NULL)
} else { # Found nrow(enrich_result@result) significantly enriched terms.
return(enrich_result)
}
}, error = function(e) { # Error during enrichment for description: e$message
return(NULL)
})
}
enrich_up <- run_enrichment(up_genes, universe_genes, ontology_type = "ALL", description = "Upregulated Genes")
enrich_down <- run_enrichment(down_genes, universe_genes, ontology_type = "ALL", description = "Downregulated Genes")
go_results_up_df <- if (!is.null(enrich_up)) as.data.frame(enrich_up) else NULL
go_results_down_df <- if (!is.null(enrich_down)) as.data.frame(enrich_down) else NULL
A lollipop plot is generated to visualize the top 10 significantly enriched GO terms (based on adjusted p-value) for each ontology category (BP, MF, CC) within both the up- and down-regulated gene sets, if significant terms were found.
combined_enrich_df <- NULL
if (!is.null(go_results_up_df)) combined_enrich_df <- bind_rows(combined_enrich_df, mutate(go_results_up_df, Regulation = "Upregulated"))
if (!is.null(go_results_down_df)) combined_enrich_df <- bind_rows(combined_enrich_df, mutate(go_results_down_df, Regulation = "Downregulated"))
if (!is.null(combined_enrich_df) && nrow(combined_enrich_df) > 0) {
top_terms_df <- combined_enrich_df %>% filter(p.adjust < 0.05) %>% group_by(Regulation, ONTOLOGY) %>% arrange(p.adjust) %>% slice_head(n = 10) %>% ungroup()
# Selected nrow(top_terms_df) top GO terms for plotting.
if (nrow(top_terms_df) > 0) {
top_terms_df <- top_terms_df %>%
mutate(TermLabel = paste0(strtrim(Description, 50), ifelse(nchar(Description) > 50, "...", ""))) %>%
arrange(Regulation, desc(-log10(p.adjust))) %>%
mutate(TermLabelOrdered = factor(TermLabel, levels = unique(TermLabel)))
ontology_colors <- c("BP" = "#1f78b4", "MF" = "#33a02c", "CC" = "#e31a1c")
lollipop_plot <- ggplot(top_terms_df, aes(x = -log10(p.adjust), y = TermLabelOrdered)) +
geom_segment(aes(yend = TermLabelOrdered, xend = 0), color = "grey50") +
geom_point(aes(color = ONTOLOGY, size = Count)) +
scale_color_manual(values = ontology_colors, name = "GO Ontology") +
facet_grid(Regulation ~ ., scales = "free_y", space = "free_y") +
labs(title = "Top GO Enrichment Terms (BP, MF, CC)", subtitle = "Top 10 significant terms per Ontology shown for Up/Down regulated genes",
x = "-Log10 Adjusted P-value", y = "GO Term Description", size = "Gene Count") +
theme_bw(base_size = 10) +
theme(plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5), axis.text.y = element_text(size = 8),
strip.text.y = element_text(angle = 0, face = "bold"), strip.background = element_rect(fill = "grey90", color = NA), panel.spacing = unit(1, "lines")) +
scale_size_continuous(range = c(3, 8))
print(lollipop_plot)
} # else { # No significant top terms found for lollipop plot. }
} # else { # No significant GO results found. Skipping GO plot. }
Top 10 significant GO terms (BP, MF, CC) for up- and down-regulated genes.
Interpretation: The GO enrichment analysis suggests that ATF3 plays a critical role in vascular smooth muscle cells by influencing the expression of genes involved in cell junction organization, plasma membrane components, signaling pathways, extracellular matrix regulation, and processes related to blood vessel function and development. The downregulation of genes associated with cell junctions and plasma membrane structures implies that ATF3 might be necessary for maintaining cellular integrity and cell-cell interactions. Conversely, the upregulation of genes involved in inflammation, extracellular matrix remodeling, and vascular processes suggests that ATF3 normally acts to suppress these pathways.
GSEA is performed to determine whether predefined sets of genes (e.g., KEGG pathways, Hallmark gene sets) show statistically significant, concordant differences between the two biological states (Treated vs. Control). Unlike standard over-representation analysis, GSEA considers the rank of all tested genes, not just those passing an arbitrary significance threshold.
A ranked list of all tested genes is created based on the
stat column (Wald statistic) from the DESeq2 results.
Ensembl IDs are mapped to Entrez/NCBI Gene IDs, as these are commonly
used in pathway databases like MSigDB. Genes are ranked in descending
order by the statistic. The number of unique Entrez IDs in the final
list is determined.
# --- Preparing Ranked Gene List for GSEA ---
gsea_input_df <- final_results_table %>% filter(!is.na(stat)) %>% select(ENSEMBL, stat)
# Mapping Ensembl IDs to Entrez/NCBI IDs...
entrez_ids <- mapIds(org.Mm.eg.db, keys = gsea_input_df$ENSEMBL, column = "ENTREZID", keytype = "ENSEMBL", multiVals = "first")
gsea_input_df <- gsea_input_df %>% mutate(ENTREZID = entrez_ids) %>% filter(!is.na(ENTREZID)) %>% group_by(ENTREZID) %>% filter(abs(stat) == max(abs(stat))) %>% slice(1) %>% ungroup()
ranked_gene_list <- gsea_input_df %>% arrange(desc(stat)) %>% pull(stat, name = ENTREZID)
names(ranked_gene_list) <- as.character(names(ranked_gene_list))
# Prepared ranked list for GSEA with length(ranked_gene_list) unique Entrez/NCBI IDs.
We perform GSEA using the KEGG pathway collection from the Molecular Signatures Database (MSigDB).
KEGG pathways (Collection C2, subcollection CP:KEGG) for Mus
musculus are fetched from MSigDB using the msigdbr
package and formatted into a list suitable for the fgsea
package. The process tracks the number of pathways retrieved and
formatted.
# --- Loading KEGG Pathways from MSigDB ---
kegg_pathways_list_msigdb <- NULL
tryCatch({
msigdb_c2_all_df <- msigdbr(species = "Mus musculus", collection = "C2")
if (is.null(msigdb_c2_all_df) || nrow(msigdb_c2_all_df) == 0) stop("Failed to fetch MSigDB C2 data.")
# Retrieved nrow(msigdb_c2_all_df) entries from MSigDB C2.
subcat_col_name <- "gs_subcollection"; entrez_col_name <- "ncbi_gene"; geneset_col_name <- "gs_name"
required_cols <- c(subcat_col_name, entrez_col_name, geneset_col_name)
if(!all(required_cols %in% colnames(msigdb_c2_all_df))) stop("Essential columns missing.")
kegg_identifier_to_use <- NULL; preferred_kegg_id <- "CP:KEGG"; legacy_kegg_id <- "CP:KEGG_LEGACY"
available_subcats_c2 <- unique(msigdb_c2_all_df[[subcat_col_name]])
if (preferred_kegg_id %in% available_subcats_c2) kegg_identifier_to_use <- preferred_kegg_id
else if (legacy_kegg_id %in% available_subcats_c2) {
kegg_identifier_to_use <- legacy_kegg_id
# Note: Using legacy KEGG identifier 'kegg_identifier_to_use'.
} else warning("KEGG identifiers not found. GSEA running on ALL C2 pathways.")
msigdb_kegg_df <- msigdb_c2_all_df
if (!is.null(kegg_identifier_to_use)) {
# Filtering for identifier: kegg_identifier_to_use
msigdb_kegg_df <- msigdb_c2_all_df %>% filter(.data[[subcat_col_name]] == kegg_identifier_to_use)
# Filtered to nrow(msigdb_kegg_df) entries.
if (nrow(msigdb_kegg_df) == 0) warning("Filtering resulted in zero pathways.")
}
if (nrow(msigdb_kegg_df) > 0) {
kegg_pathways_list_msigdb <- msigdb_kegg_df %>%
filter(!is.na(.data[[entrez_col_name]])) %>%
mutate(across(all_of(entrez_col_name), as.character)) %>%
split(x = .[[entrez_col_name]], f = .[[geneset_col_name]]) %>% lapply(unique)
if (length(kegg_pathways_list_msigdb) > 0) { # Formatted length(kegg_pathways_list_msigdb) KEGG pathways for fgsea.
} else stop("Failed to format KEGG pathways.")
} else stop("No KEGG pathway data remaining.")
}, error = function(e) { stop("Error during MSigDB KEGG processing: ", e$message) })
## NULL
The fgsea function is executed using the ranked gene
list and the prepared KEGG pathway list. Permutation testing is used to
assess significance.
# --- Running GSEA (KEGG) using fgsea ---
fgsea_results_kegg_df <- NULL
if(!is.null(kegg_pathways_list_msigdb) && length(kegg_pathways_list_msigdb) > 0) {
set.seed(1234)
fgsea_results_kegg <- fgsea(pathways = kegg_pathways_list_msigdb, stats = ranked_gene_list,
minSize = 15, maxSize = 500, nPermSimple = 10000)
fgsea_results_kegg_df <- as_tibble(fgsea_results_kegg) %>% arrange(padj)
# GSEA (KEGG) complete.
} # else { # Skipping GSEA (KEGG) as pathway list is empty. }
A bar plot summarizes the results, showing the Normalized Enrichment Scores (NES) for the top significantly enriched KEGG pathways (positively enriched in Treated vs. Control have positive NES, negatively enriched have negative NES).
# --- Generating GSEA summary plot (KEGG) ---
if (!is.null(fgsea_results_kegg_df) && nrow(fgsea_results_kegg_df) > 0) {
pathway_padj_threshold <- 0.10; top_n_pathways <- 10
sig_fgsea_results_kegg <- fgsea_results_kegg_df %>% filter(padj < pathway_padj_threshold) %>% mutate(pathway_name = as.character(pathway)) %>% arrange(padj)
top_up_pathways_kegg <- sig_fgsea_results_kegg %>% filter(NES > 0) %>% slice_head(n = top_n_pathways)
top_down_pathways_kegg <- sig_fgsea_results_kegg %>% filter(NES < 0) %>% arrange(NES) %>% slice_head(n = top_n_pathways)
top_pathways_kegg_df <- bind_rows(top_up_pathways_kegg, top_down_pathways_kegg)
# Selected nrow(top_up_pathways_kegg) up & nrow(top_down_pathways_kegg) down KEGG pathways (padj < pathway_padj_threshold).
if (nrow(top_pathways_kegg_df) > 0) {
top_pathways_kegg_df <- top_pathways_kegg_df %>% mutate(pathway_name_ordered = factor(pathway_name, levels = pathway_name[order(NES)]))
gsea_plot_kegg <- ggplot(top_pathways_kegg_df, aes(x = NES, y = pathway_name_ordered)) +
geom_col(aes(fill = NES > 0), show.legend = FALSE) + scale_fill_manual(values = c("TRUE" = "red", "FALSE" = "blue")) +
labs(title = "Top Enriched KEGG Pathways from GSEA", subtitle = paste("Top", nrow(top_up_pathways_kegg), "Up &", nrow(top_down_pathways_kegg), "Down Pathways (padj <", pathway_padj_threshold, ")"),
x = "Normalized Enrichment Score (NES)", y = NULL) +
theme_minimal(base_size = 10) + theme(plot.title = element_text(hjust = 0.5, face="bold"), plot.subtitle = element_text(hjust = 0.5), axis.text.y = element_text(size = 8), panel.grid.major.y = element_blank()) +
geom_vline(xintercept = 0, linetype = "dashed", color = "grey50")
print(gsea_plot_kegg)
} # else { # No significant KEGG pathways for summary plot. }
} # else { # GSEA (KEGG) results not available for plotting. }
Bar plot of Normalized Enrichment Scores (NES) for top KEGG pathways.
Enrichment plots show the running enrichment score for specific pathways across the ranked gene list, illustrating how the enrichment is achieved (e.g., driven by top-ranked genes). Plots are generated for the top significant KEGG pathways identified previously.
# --- Generating GSEA Enrichment Plots (KEGG) ---
if (exists("top_pathways_kegg_df") && nrow(top_pathways_kegg_df) > 0) {
pathways_to_plot_kegg <- top_pathways_kegg_df$pathway_name
list_of_enrichment_plots_kegg <- list()
# Generating individual KEGG enrichment plots...
for (pathway_name in pathways_to_plot_kegg) {
if (pathway_name %in% names(kegg_pathways_list_msigdb)) {
current_plot <- plotEnrichment(pathway = kegg_pathways_list_msigdb[[pathway_name]], stats = ranked_gene_list) +
ggtitle(pathway_name) + theme(plot.title = element_text(size = 8, hjust = 0.5))
list_of_enrichment_plots_kegg[[pathway_name]] <- current_plot
} else warning("KEGG Pathway '", pathway_name, "' not found. Skipping plot.")
}
if (length(list_of_enrichment_plots_kegg) > 0) {
# Combining KEGG plots...
num_plots <- length(list_of_enrichment_plots_kegg); num_cols <- if (num_plots <= 4) 2 else if (num_plots <= 10) 3 else 4
combined_gsea_enrichment_plot_kegg <- wrap_plots(list_of_enrichment_plots_kegg, ncol = num_cols) + plot_annotation(title = "GSEA Enrichment Plots for Top KEGG Pathways")
print(combined_gsea_enrichment_plot_kegg)
} # else { # No KEGG enrichment plots generated. }
} # else { # No significant KEGG pathways for enrichment plots. }
GSEA enrichment plots for top significant KEGG pathways.
Interpretation: The KEGG pathway enrichment analysis reveals that ATF3 knockdown in mouse vascular smooth muscle cells leads to significant alterations in several key cellular processes. The upregulation of pathways related to protein synthesis (Ribosome), metabolism (Carbon metabolism, Glycolysis/Gluconeogenesis), cell adhesion (Focal adhesion), stress response (Protein processing in endoplasmic reticulum), cytoskeletal regulation (Regulation of actin cytoskeleton), and potentially inflammation (Phagosome, Complement and coagulation cascades) suggests that ATF3 might normally suppress these processes. Conversely, the downregulation of pathways involved in cell cycle and DNA replication indicates that ATF3 may play a positive role in promoting cell proliferation in these cells.
We repeat the GSEA process using the Hallmark gene sets from MSigDB, which represent well-defined biological states or processes.
Hallmark gene sets (Collection H) for Mus musculus are
fetched from MSigDB and formatted for fgsea. The number of
pathways successfully formatted is tracked.
# --- Loading Hallmark Gene Sets from MSigDB ---
hallmark_pathways_list_msigdb <- NULL
tryCatch({
msigdb_h_df <- msigdbr(species = "Mus musculus", collection = "H")
if (is.null(msigdb_h_df) || nrow(msigdb_h_df) == 0) stop("Failed to fetch MSigDB Hallmark (H) data.")
# Successfully retrieved nrow(msigdb_h_df) entries from MSigDB Collection H.
entrez_col_name <- "ncbi_gene"; geneset_col_name <- "gs_name"
required_cols <- c(entrez_col_name, geneset_col_name)
if(!all(required_cols %in% colnames(msigdb_h_df))) stop("Essential columns missing from Hallmark data.")
if (nrow(msigdb_h_df) > 0) {
hallmark_pathways_list_msigdb <- msigdb_h_df %>%
filter(!is.na(.data[[entrez_col_name]])) %>%
mutate(across(all_of(entrez_col_name), as.character)) %>%
split(x = .[[entrez_col_name]], f = .[[geneset_col_name]]) %>% lapply(unique)
if (length(hallmark_pathways_list_msigdb) > 0) { # Formatted length(hallmark_pathways_list_msigdb) Hallmark pathways for fgsea.
} else stop("Failed to format Hallmark pathways.")
} else stop("No Hallmark pathway data found.")
}, error = function(e) { stop("Error during MSigDB Hallmark processing: ", e$message) })
## NULL
The fgsea function is executed using the ranked gene
list and the Hallmark pathway list.
# --- Running GSEA (Hallmark) using fgsea ---
fgsea_results_hallmark_df <- NULL
if(!is.null(hallmark_pathways_list_msigdb) && length(hallmark_pathways_list_msigdb) > 0) {
set.seed(1234)
fgsea_results_hallmark <- fgsea(pathways = hallmark_pathways_list_msigdb, stats = ranked_gene_list,
minSize = 10, maxSize = 500, nPermSimple = 10000)
fgsea_results_hallmark_df <- as_tibble(fgsea_results_hallmark) %>% arrange(padj)
# GSEA (Hallmark) complete.
} # else { # Skipping GSEA (Hallmark) as pathway list is empty or failed to load. }
A bar plot shows the NES for the top significantly enriched Hallmark pathways.
# --- Generating GSEA summary plot (Hallmark) ---
if (!is.null(fgsea_results_hallmark_df) && nrow(fgsea_results_hallmark_df) > 0) {
pathway_padj_threshold <- 0.10; top_n_pathways <- 10
sig_fgsea_results_hallmark <- fgsea_results_hallmark_df %>% filter(padj < pathway_padj_threshold) %>% mutate(pathway_name = as.character(pathway)) %>% arrange(padj)
top_up_pathways_hallmark <- sig_fgsea_results_hallmark %>% filter(NES > 0) %>% slice_head(n = top_n_pathways)
top_down_pathways_hallmark <- sig_fgsea_results_hallmark %>% filter(NES < 0) %>% arrange(NES) %>% slice_head(n = top_n_pathways)
top_pathways_hallmark_df <- bind_rows(top_up_pathways_hallmark, top_down_pathways_hallmark)
# Selected nrow(top_up_pathways_hallmark) up & nrow(top_down_pathways_hallmark) Hallmark pathways (padj < pathway_padj_threshold).
if (nrow(top_pathways_hallmark_df) > 0) {
top_pathways_hallmark_df <- top_pathways_hallmark_df %>%
mutate(pathway_label = gsub("HALLMARK_", "", pathway_name)) %>% mutate(pathway_label = gsub("_", " ", pathway_label)) %>%
mutate(pathway_label_ordered = factor(pathway_label, levels = pathway_label[order(NES)]))
gsea_plot_hallmark <- ggplot(top_pathways_hallmark_df, aes(x = NES, y = pathway_label_ordered)) +
geom_col(aes(fill = NES > 0), show.legend = FALSE) + scale_fill_manual(values = c("TRUE" = "red", "FALSE" = "blue")) +
labs(title = "Top Enriched Hallmark Pathways from GSEA", subtitle = paste("Top", nrow(top_up_pathways_hallmark), "Up &", nrow(top_down_pathways_hallmark), "Down Pathways (padj <", pathway_padj_threshold, ")"),
x = "Normalized Enrichment Score (NES)", y = NULL) +
theme_minimal(base_size = 10) + theme(plot.title = element_text(hjust = 0.5, face="bold"), plot.subtitle = element_text(hjust = 0.5), axis.text.y = element_text(size = 8), panel.grid.major.y = element_blank()) +
geom_vline(xintercept = 0, linetype = "dashed", color = "grey50")
print(gsea_plot_hallmark)
} # else { # No significant Hallmark pathways for summary plot. }
} # else { # GSEA (Hallmark) results not available for plotting. }
Bar plot of Normalized Enrichment Scores (NES) for top Hallmark pathways.
Enrichment plots are generated for the top significant Hallmark pathways.
# --- Generating GSEA Enrichment Plots (Hallmark) ---
if (exists("top_pathways_hallmark_df") && nrow(top_pathways_hallmark_df) > 0) {
pathways_to_plot_hallmark <- top_pathways_hallmark_df$pathway
list_of_enrichment_plots_hallmark <- list()
# Generating individual Hallmark enrichment plots...
for (pathway_name in pathways_to_plot_hallmark) {
if (pathway_name %in% names(hallmark_pathways_list_msigdb)) {
plot_title <- gsub("HALLMARK_", "", pathway_name); plot_title <- gsub("_", " ", plot_title)
current_plot <- plotEnrichment(pathway = hallmark_pathways_list_msigdb[[pathway_name]], stats = ranked_gene_list) +
ggtitle(plot_title) + theme(plot.title = element_text(size = 8, hjust = 0.5))
list_of_enrichment_plots_hallmark[[pathway_name]] <- current_plot
} else warning("Hallmark Pathway '", pathway_name, "' not found. Skipping plot.")
}
if (length(list_of_enrichment_plots_hallmark) > 0) {
# Combining Hallmark plots...
num_plots <- length(list_of_enrichment_plots_hallmark); num_cols <- if (num_plots <= 4) 2 else if (num_plots <= 10) 3 else 4
combined_gsea_enrichment_plot_hallmark <- wrap_plots(list_of_enrichment_plots_hallmark, ncol = num_cols) + plot_annotation(title = "GSEA Enrichment Plots for Top Hallmark Pathways")
print(combined_gsea_enrichment_plot_hallmark)
} # else { # No Hallmark enrichment plots generated. }
} # else { # No significant Hallmark pathways for enrichment plots. }
GSEA enrichment plots for top significant Hallmark pathways.
Interpretation: The Hallmark pathway enrichment analysis provides a broader view of the biological states and processes affected by ATF3 knockdown in vascular smooth muscle cells. The results suggest that ATF3 plays a role in suppressing pathways related to inflammation (TNFA signaling via NFkB, Interferon Gamma Response), cellular stress (p53 pathway, Unfolded Protein Response, UV Response Down), and potentially the epithelial-mesenchymal transition. Furthermore, ATF3 appears to be involved in promoting cell proliferation, as its knockdown leads to the downregulation of pathways critical for cell cycle progression (Mitotic Spindle, E2F Targets, G2M Checkpoint) and DNA replication. The downregulation of Estrogen Response Early suggests a potential link between ATF3 and hormonal signaling in these cells.
This section provides access to the complete results tables for the Gene Ontology and GSEA analyses performed earlier. Use the tabs below to navigate between the different result sets. The tables are interactive, allowing sorting, searching, and pagination. Numeric columns are formatted for readability.
if (!is.null(go_results_up_df) && nrow(go_results_up_df) > 0) {
datatable(go_results_up_df,
caption = "Full GO Enrichment Results for Upregulated Genes",
rownames = FALSE, filter = 'top', extensions = 'Buttons',
options = list(pageLength = 10, scrollX = TRUE, dom = 'Bfrtip', buttons = c('copy', 'csv', 'excel'))) %>%
formatSignif(columns = c('pvalue', 'p.adjust', 'qvalue'), digits = 3)
} else {
# No significant GO enrichment results found for upregulated genes.
knitr::asis_output("No significant GO enrichment results found for upregulated genes.") # Use this for plain text output in Rmd
}
if (!is.null(go_results_down_df) && nrow(go_results_down_df) > 0) {
datatable(go_results_down_df,
caption = "Full GO Enrichment Results for Downregulated Genes",
rownames = FALSE, filter = 'top', extensions = 'Buttons',
options = list(pageLength = 10, scrollX = TRUE, dom = 'Bfrtip', buttons = c('copy', 'csv', 'excel'))) %>%
formatSignif(columns = c('pvalue', 'p.adjust', 'qvalue'), digits = 3)
} else {
# No significant GO enrichment results found for downregulated genes.
knitr::asis_output("No significant GO enrichment results found for downregulated genes.")
}
if (exists("fgsea_results_kegg_df") && !is.null(fgsea_results_kegg_df) && nrow(fgsea_results_kegg_df) > 0) {
gsea_display_kegg_df <- fgsea_results_kegg_df %>%
select(pathway, pval, padj, ES, NES, size, leadingEdge) %>%
arrange(padj)
datatable(gsea_display_kegg_df,
caption = "Full GSEA Results for KEGG Pathways",
rownames = FALSE, filter = 'top', extensions = 'Buttons',
options = list(pageLength = 10, scrollX = TRUE, dom = 'Bfrtip', buttons = c('copy', 'csv', 'excel'))) %>%
formatSignif(columns = c('pval', 'padj'), digits = 3) %>%
formatRound(columns = c('ES', 'NES'), digits = 3)
} else {
# GSEA (KEGG) results are not available.
knitr::asis_output("GSEA (KEGG) results are not available.")
}
if (exists("fgsea_results_hallmark_df") && !is.null(fgsea_results_hallmark_df) && nrow(fgsea_results_hallmark_df) > 0) {
gsea_display_hallmark_df <- fgsea_results_hallmark_df %>%
select(pathway, pval, padj, ES, NES, size, leadingEdge) %>%
mutate(pathway_display = gsub("HALLMARK_", "", pathway)) %>%
select(pathway_display, pval, padj, ES, NES, size, leadingEdge) %>%
arrange(padj)
datatable(gsea_display_hallmark_df,
caption = "Full GSEA Results for Hallmark Gene Sets",
rownames = FALSE, filter = 'top', extensions = 'Buttons',
options = list(pageLength = 10, scrollX = TRUE, dom = 'Bfrtip', buttons = c('copy', 'csv', 'excel'))) %>%
formatSignif(columns = c('pval', 'padj'), digits = 3) %>%
formatRound(columns = c('ES', 'NES'), digits = 3)
} else {
# GSEA (Hallmark) results are not available.
knitr::asis_output("GSEA (Hallmark) results are not available.")
}